Here we want to intersect the proteins identified in the 4 experiments we will publish: 1. 3 reps 400mJ/cm2 254nm UVC SILAC 2. TMT RNAse experiment 3. TMT dosage experiment 4. Cell cycle experiment
source("../../../R/Utility.R")
source("../../../R/Enrichment.R")
data.table 1.10.4.3
The fastest way to learn (by data.table authors): https://www.datacamp.com/courses/data-analysis-the-data-table-way
Documentation: ?data.table, example(data.table) and browseVignettes("data.table")
Release notes, videos and slides: http://r-datatable.com
Attaching package: ‘data.table’
The following object is masked from ‘package:GenomicRanges’:
shift
The following object is masked from ‘package:IRanges’:
shift
The following objects are masked from ‘package:S4Vectors’:
first, second
The following objects are masked from ‘package:dplyr’:
between, first, last
The following objects are masked from ‘package:reshape2’:
dcast, melt
Loading required package: BiasedUrn
Loading required package: geneLenDataBase
Loading required package: lattice
Loading required package: survival
Loading required package: Formula
Attaching package: ‘Hmisc’
The following object is masked from ‘package:gridExtra’:
combine
The following object is masked from ‘package:AnnotationDbi’:
contents
The following objects are masked from ‘package:Biobase’:
combine, contents
The following object is masked from ‘package:BiocGenerics’:
combine
The following objects are masked from ‘package:dplyr’:
combine, src, summarize
The following objects are masked from ‘package:base’:
format.pval, round.POSIXt, trunc.POSIXt, units
library("Biobase")
#source("https://bioconductor.org/biocLite.R")
#biocLite("MSnbase")
single_classification_df <- readRDS("../../../dLOPIT/pilot/notebooks/analysis/hyperLOPIT_Classifications.rds")
write.table(single_classification_df, "../../../dLOPIT/pilot/notebooks/analysis/hyperLOPIT_Classifications.tsv",
sep="\t", quote=F)
SILAC_df <- read.table("../../trizol_dosage/analysis/data_processing/enriched_proteins.tsv", sep="\t", header=TRUE)
SILAC_df <- SILAC_df[SILAC_df$dosage==400,]
print(length(unique(SILAC_df$master_protein)))
[1] 773
TMT_RNAse_df <- readRDS("../../TMT_4_step_3_step/notebooks/protein_quantification.Rds")
Loading required package: MSnbase
Loading required package: mzR
Loading required package: Rcpp
Loading required package: BiocParallel
Loading required package: ProtGenerics
This is MSnbase version 2.4.1
Visit https://lgatto.github.io/MSnbase/ to get started.
Attaching package: ‘MSnbase’
The following objects are masked from ‘package:Hmisc’:
impute, naplot
The following object is masked from ‘package:stats’:
smooth
The following object is masked from ‘package:base’:
trimws
print(head(data.frame(exprs(TMT_RNAse_df))))
print(length(rownames(exprs(TMT_RNAse_df))))
[1] 2540
TMT_dosage_df <- readRDS("../../TMT_4_step_dosage/analysis//protein_quantification.Rds")
print(head(data.frame(exprs(TMT_dosage_df))))
print(length(rownames(exprs(TMT_dosage_df))))
[1] 1505
TMT_dosage_repeat_df <- readRDS("../../TMT_4_step_dosage_repeat/analysis//protein_quantification.Rds")
print(head(data.frame(exprs(TMT_dosage_repeat_df))))
print(length(rownames(exprs(TMT_dosage_repeat_df))))
[1] 2116
TMT_cell_cycle <- readRDS("../../cell_cycle/data/rpb_res_pro_agg_norm_impute.rds")
print(head(data.frame(exprs(TMT_cell_cycle))))
print(length(rownames(exprs(TMT_cell_cycle))))
[1] 2198
SILAC_proteins <- as.character(unique(SILAC_df$master_protein))
TMT_RNAse_proteins <- as.character(rownames(exprs(TMT_RNAse_df)))
TMT_dosage_proteins <- as.character(rownames(exprs(TMT_dosage_df)))
TMT_dosage_rep_proteins <- as.character(rownames(exprs(TMT_dosage_repeat_df)))
TMT_cell_cycle_proteins <- as.character(rownames(exprs(TMT_cell_cycle)))
#all_proteins <- Reduce(union, list(SILAC_proteins, TMT_RNAse_proteins, TMT_dosage_proteins,
# TMT_dosage_rep_proteins, TMT_cell_cycle_proteins))
#cat_proteins <- c(SILAC_proteins, TMT_RNAse_proteins, TMT_dosage_proteins,
# TMT_dosage_rep_proteins, TMT_cell_cycle_proteins)
#all_proteins <- Reduce(union, list(SILAC_proteins, TMT_RNAse_proteins,
# TMT_dosage_rep_proteins, TMT_cell_cycle_proteins))
all_proteins <- Reduce(union, list(TMT_RNAse_proteins,
TMT_dosage_rep_proteins, TMT_cell_cycle_proteins))
#cat_proteins <- c(SILAC_proteins, TMT_RNAse_proteins, TMT_dosage_rep_proteins, TMT_cell_cycle_proteins)
cat_proteins <- c(TMT_RNAse_proteins, TMT_dosage_rep_proteins, TMT_cell_cycle_proteins)
print(length(all_proteins))
[1] 3019
protein_counts <- table(as.character(cat_proteins))
plot(hist(protein_counts))
print(length(protein_counts[protein_counts>1]))
[1] 2193
seen_once_proteins <- names(protein_counts[protein_counts==1])
seen_proteins <- names(protein_counts[protein_counts>0])
seen_twice_proteins <- names(protein_counts[protein_counts>1])
seen_three_proteins <- names(protein_counts[protein_counts==3])
seen_all_proteins <- names(protein_counts[protein_counts==3]) # the same as above now SILAC excluded
print('P25054' %in% cat_proteins) # APC
[1] TRUE
print('Q15717' %in% cat_proteins) # ELAV1/HuR
[1] TRUE
print(protein_counts[names(protein_counts)=='P25054'])
P25054
2
print(protein_counts[names(protein_counts)=='Q15717'])
Q15717
3
data.frame(protein_counts)
write.table(protein_counts, "protein_counts.tsv", sep="\t", quote=F, row.names=F)
go_RNA_binding_inf <- "../../../shared_files/uniprot_go/uniprot_go_RNA_binding.tsv"
glycoproteins_inf <- "../../../shared_files/glycoproteins.tsv"
signal_peptides_inf <- "../../../shared_files/signal_peptides.tsv"
go_RNA_binding_df <- read.table(go_RNA_binding_inf, sep="\t", header=T)
glycoproteins_df <- read.table(glycoproteins_inf, sep="\t", header=T)
signal_peptides_df <- read.table(signal_peptides_inf, sep="\t", header=T)
geiger_df <- read.csv("../../../shared_files/Geiger_et_al/Geiger_et_al_2012_U2OS_peptides_plus_master.tsv",
header=TRUE, sep="\t")
print(length(unique(geiger_df$master_protein)))
[1] 9463
geiger_df <- geiger_df[geiger_df$unique==1,]
print(length(unique(geiger_df$master_protein)))
[1] 7455
geiger_df <- geiger_df[geiger_df$master_protein!="",]
print(length(unique(geiger_df$master_protein)))
[1] 7454
geiger_df <- geiger_df[geiger_df$crap_protein==0,]
print(length(unique(geiger_df$master_protein)))
[1] 7454
geiger_df <- geiger_df[geiger_df$associated_crap_protein==0,]
print(length(unique(geiger_df$master_protein)))
[1] 7454
agg_geiger_max <- aggregate(list(geiger_df$Max, as.numeric(as.character(geiger_df$protein_length))),
by=list(geiger_df$master_protein), FUN=mean)
colnames(agg_geiger_max) <- c("master_protein", "Max", "Length")
agg_geiger_max$log_length <- log(agg_geiger_max$Length, 2)
agg_geiger_max$norm_log_length <- normrank(agg_geiger_max$log_length)
agg_geiger_max$norm_abundance <- normrank(agg_geiger_max$Max)
agg_geiger_max$harm_mean_bias <- apply(agg_geiger_max[,c("norm_log_length", "norm_abundance")], MARGIN =1, FUN=harm_mean)
agg_geiger_max <- merge(agg_geiger_max, data.frame(protein_counts),
by.x="master_protein", by.y="Var1", all.x=TRUE)
agg_geiger_max$Freq[is.na(agg_geiger_max$Freq)] <- 0
agg_geiger_max <- merge(agg_geiger_max, single_classification_df[,c("Accession", "svm", "svm.scores")],
by.x="master_protein", by.y="Accession", all.x=TRUE)
geiger_u2os_proteins <- unique(agg_geiger_max$master_protein)
RBPs <- unique(go_RNA_binding_df$Entry)
glycoproteins <- unique(glycoproteins_df$protein)
makeVennPlot(seen_once_proteins, RBPs, glycoproteins, subset_to=geiger_u2os_proteins)
makeVennPlot(seen_twice_proteins, RBPs, glycoproteins, subset_to=geiger_u2os_proteins)
makeVennPlot(seen_three_proteins, RBPs, glycoproteins, subset_to=geiger_u2os_proteins)
makeVennPlot(seen_all_proteins, RBPs, glycoproteins, subset_to=geiger_u2os_proteins)
qm <- myProtMapper(union(all_proteins, unique(agg_geiger_max$master_protein)))
Querying chunk 1
Querying chunk 2
Querying chunk 3
Querying chunk 4
Querying chunk 5
Querying chunk 6
Querying chunk 7
Querying chunk 8
Finished
Pass returnall=TRUE to return lists of duplicate or missing query terms.
[1] "No KEGG"
the condition has length > 1 and only the first element will be used
# Mapping Uniprot to GO or Uniprot to Interpro for goseq
uniprot.go <- makeGene2Cat(qm,"query","go.all",";")
polyA_binding <- uniprot.go[uniprot.go$to.id=="GO:0008143", "query"]
makeVennPlot(TMT_RNAse_proteins, TMT_dosage_rep_proteins, TMT_cell_cycle_proteins, set1_name = "RNAse", set2_name="Dosage", set3_name="Cell cycle", subset_to=geiger_u2os_proteins)
makeVennPlot(TMT_RNAse_proteins, TMT_dosage_rep_proteins, TMT_cell_cycle_proteins, set1_name = "RNAse", set2_name="Dosage", set3_name="Cell cycle")
signal_peptides <- unique(signal_peptides_df$accession)
makeVennPlot(seen_once_proteins, signal_peptides, glycoproteins, subset_to=geiger_u2os_proteins, set2_name="Signal peptide")
makeVennPlot(seen_twice_proteins, signal_peptides, glycoproteins, subset_to=geiger_u2os_proteins, set2_name="Signal peptide")
makeVennPlot(seen_three_proteins, signal_peptides, glycoproteins, subset_to=geiger_u2os_proteins, set2_name="Signal peptide")
makeVennPlot(seen_all_proteins, signal_peptides, glycoproteins, subset_to=geiger_u2os_proteins, set2_name="Signal peptide")
evi_signal_peptides <- unique(signal_peptides_df[signal_peptides_df$evidence=="True",]$accession)
makeVennPlot(seen_once_proteins, evi_signal_peptides, glycoproteins,
subset_to=geiger_u2os_proteins, set2_name="Signal peptide\n(with evidence)")
makeVennPlot(seen_twice_proteins, evi_signal_peptides, glycoproteins,
subset_to=geiger_u2os_proteins, set2_name="Signal peptide\n(with evidence)")
makeVennPlot(seen_three_proteins, evi_signal_peptides, glycoproteins,
subset_to=geiger_u2os_proteins, set2_name="Signal peptide\n(with evidence)")
makeVennPlot(seen_all_proteins, evi_signal_peptides, glycoproteins,
subset_to=geiger_u2os_proteins, set2_name="Signal peptide\n(with evidence)")
exp_signal_peptides <- unique(signal_peptides_df[signal_peptides_df$experimental=="True",]$accession)
makeVennPlot(seen_twice_proteins, exp_signal_peptides, glycoproteins,
subset_to=geiger_u2os_proteins, set2_name="Signal peptide\n(with exp. evidence)")
makeVennPlot(seen_once_proteins, exp_signal_peptides, glycoproteins,
subset_to=geiger_u2os_proteins, set2_name="Signal peptide\n(with exp. evidence)")
agg_geiger_max$glycoprotein <- agg_geiger_max$master_protein %in% unique(glycoproteins_df$protein)
agg_geiger_max$RBP <- agg_geiger_max$master_protein %in% unique(go_RNA_binding_df$Entry)
agg_geiger_max$seen <- agg_geiger_max$master_protein %in% all_proteins
agg_geiger_max$seen_once <- agg_geiger_max$master_protein %in% names(protein_counts[protein_counts==1])
agg_geiger_max$seen_twice <- agg_geiger_max$master_protein %in% names(protein_counts[protein_counts>1])
agg_geiger_max$seen_three <- agg_geiger_max$master_protein %in% names(protein_counts[protein_counts>2])
agg_geiger_max$seen_all <- agg_geiger_max$master_protein %in% names(protein_counts[protein_counts>3])
agg_geiger_max$seen_twice_glycoproteins <- (agg_geiger_max$seen_twice & agg_geiger_max$glycoprotein)
agg_geiger_max$signal <- agg_geiger_max$master_protein %in% signal_peptides
print(table(agg_geiger_max$signal))
FALSE TRUE
6909 545
print(agg_geiger_max)
saveRDS(agg_geiger_max, "agg_geiger_max.rds")
rows <- NULL
n=1
for(localisation in c("ER", "GOLGI", "LYSOSOME", "PM")){
if(is.na(localisation)){
next()}
print(localisation)
tmp_df <- agg_geiger_max[(agg_geiger_max$glycoprotein==T &
agg_geiger_max$svm==localisation),]
tmp_df<- tmp_df[!is.na(tmp_df$master_protein),]
print(table(tmp_df$signal, tmp_df$Freq))
print(table(tmp_df$Freq>0))
#for(freq in 1:4){ # SILAC REMOVED
for(freq in 1:3){
ft <- fisher.test(table(tmp_df$Freq>=freq, tmp_df$signal))
rows[[n]]<-c(localisation, freq, ft$p.value, ft$estimate, ft$conf.int[1:2])
n <- n+1
}
}
[1] "ER"
0 1 2 3
FALSE 29 5 4 7
TRUE 44 7 6 27
FALSE TRUE
73 56
[1] "GOLGI"
0 1 2 3
FALSE 28 3 1 1
TRUE 11 1 3 3
FALSE TRUE
39 12
[1] "LYSOSOME"
0 1 2 3
FALSE 3 2 1 2
TRUE 30 6 5 5
FALSE TRUE
33 21
[1] "PM"
0 1 2 3
FALSE 30 16 8 10
TRUE 39 13 19 35
FALSE TRUE
69 101
final_df <- data.frame(do.call("rbind", rows))
new_colnames <- c("localisation", "freq", "p.value", "estimate", "ci_low", "ci_high")
colnames(final_df) <- new_colnames
final_df[,new_colnames[2:6]] = apply(final_df[,new_colnames[2:6]], 2, function(x) as.numeric(as.character(x)))
p <- ggplot(final_df[final_df$localisation %in% c("PM", "ER"),], aes(freq, estimate, group=localisation, colour=localisation)) +
geom_point() + geom_line() + my_theme +
geom_line(aes(y=ci_low), colour="black", linetype=2) +
geom_line(aes(y=ci_high), colour="black", linetype=2) +
facet_wrap(~localisation, scales="free") +
expand_limits(y = 0) +
scale_colour_discrete(guide=F) +
xlab("Protein observed in at least n experiments") +
ylab("Signal peptide odds ratio")
print(p)
rows = NULL
n = 1
#for(freq in 1:4){
for(freq in 1:3){
tmp_df <- agg_geiger_max[agg_geiger_max$Freq>=freq,]
total = length(tmp_df$master_protein)
for(annot in c("RBP", "glycoprotein", "signal")){
obs <- sum(tmp_df[[annot]])
exp <- sum(agg_geiger_max[[annot]]) * (total/length(agg_geiger_max$master_protein))
over_rep <- obs/exp
print(freq>0)
#if(freq>0){
ft <- fisher.test(table(agg_geiger_max$Freq>=freq, agg_geiger_max[[annot]]))
rows[[n]]<-c(freq, annot, mean(as.numeric(tmp_df[[annot]])), exp, obs, over_rep,
ft$p.value, ft$estimate, ft$conf.int[1:2])
#}
#else{
# rows[[n]]<-c(freq, annot, mean(as.numeric(tmp_df[[annot]])), exp, obs, over_rep, NA, NA, NA)
#}
n <- n+1
}
#rows = NULL
agg_geiger_max_only_glyco <- agg_geiger_max[(agg_geiger_max$glycoprotein==T &
agg_geiger_max$svm=="PM"),]
#for(freq in 1:4){
for(freq in 1:3){
tmp_df <- agg_geiger_max_only_glyco[agg_geiger_max_only_glyco$Freq>=freq,]
total = length(tmp_df$master_protein)
obs <- sum(tmp_df[["signal"]])
exp <- sum(agg_geiger_max_only_glyco[["signal"]]) * (total/length(agg_geiger_max_only_glyco$master_protein))
over_rep <- obs/exp
if(freq>0){
ft <- fisher.test(table(agg_geiger_max_only_glyco$Freq>=freq, agg_geiger_max_only_glyco$signal))
rows[[n]]<-c(freq, "signal_glyco", mean(as.numeric(tmp_df[["signal"]])), exp, obs, over_rep,
ft$p.value, ft$estimate, ft$conf.int[1:2])
}
else{
rows[[n]]<-c(freq, "signal_glyco", mean(as.numeric(tmp_df[["signal"]])), exp, obs, over_rep, NA, NA, NA)
}
#print(rows[[n]])
print(freq)
print(dim(tmp_df))
print(exp)
print(obs)
n <- n+1
}
}
[1] TRUE
[1] TRUE
[1] TRUE
[1] 1
[1] 417 19
[1] NA
[1] NA
[1] 2
[1] 388 19
[1] NA
[1] NA
[1] 3
[1] 361 19
[1] NA
[1] NA
[1] TRUE
[1] TRUE
[1] TRUE
[1] 1
[1] 417 19
[1] NA
[1] NA
[1] 2
[1] 388 19
[1] NA
[1] NA
[1] 3
[1] 361 19
[1] NA
[1] NA
[1] TRUE
[1] TRUE
[1] TRUE
[1] 1
[1] 417 19
[1] NA
[1] NA
[1] 2
[1] 388 19
[1] NA
[1] NA
[1] 3
[1] 361 19
[1] NA
[1] NA
over_rep_df <- data.frame(do.call("rbind", rows))
colnames(over_rep_df) <- c("Freq", "feature", "fraction", "exp", "obs", "over_rep",
"p.value", "odds_ratio", "odds_ci")
print(over_rep_df)
over_rep_df$over_rep <- as.numeric(as.character(over_rep_df$over_rep))
over_rep_df$fraction <- as.numeric(as.character(over_rep_df$fraction))
over_rep_df$Freq <- as.numeric(as.character(over_rep_df$Freq))
over_rep_df$odds_ratio <- as.numeric(as.character(over_rep_df$odds_ratio))
p <- ggplot(over_rep_df, aes(as.numeric(Freq), over_rep, colour=feature)) +
geom_line() + my_theme
print(p)
p <- ggplot(over_rep_df, aes(as.numeric(Freq), fraction, colour=feature)) +
geom_line() + my_theme
print(p)
p <- ggplot(over_rep_df[over_rep_df$Freq>0,], aes(as.numeric(Freq), odds_ratio, colour=feature)) +
geom_point() + geom_line() + my_theme + expand_limits(y = 0)
print(p)
p <- ggplot(over_rep_df[(over_rep_df$Freq>0 & over_rep_df$feature %in% c("RBP", "glycoprotein")),],
aes(as.numeric(Freq), odds_ratio, colour=feature)) +
geom_point() + geom_line() + my_theme + expand_limits(y = 0) +
xlab("Protein observed in at least n experiments") +
ylab("Odds ratio")
print(p)
runFishersEnrichment(agg_geiger_max, "RBP", "seen_once")
FALSE TRUE
FALSE 5520 604
TRUE 1206 124
Fisher's Exact Test for Count Data
data: table(df[[col]], df[[identified_col]])
p-value = 0.5753
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.760614 1.153889
sample estimates:
odds ratio
0.9396978
runFishersEnrichment(agg_geiger_max, "RBP", "seen_twice")
FALSE TRUE
FALSE 4739 1385
TRUE 574 756
Fisher's Exact Test for Count Data
data: table(df[[col]], df[[identified_col]])
p-value < 2.2e-16
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
3.973062 5.111818
sample estimates:
odds ratio
4.505475
runFishersEnrichment(agg_geiger_max, "RBP", "seen_three")
FALSE TRUE
FALSE 5146 978
TRUE 690 640
Fisher's Exact Test for Count Data
data: table(df[[col]], df[[identified_col]])
p-value < 2.2e-16
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
4.287087 5.553530
sample estimates:
odds ratio
4.879162
runFishersEnrichment(agg_geiger_max, "glycoprotein", "seen_once")
FALSE TRUE
FALSE 6064 632
TRUE 662 96
Fisher's Exact Test for Count Data
data: table(df[[col]], df[[identified_col]])
p-value = 0.00658
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
1.094364 1.754705
sample estimates:
odds ratio
1.39134
runFishersEnrichment(agg_geiger_max, "glycoprotein", "seen_twice")
FALSE TRUE
FALSE 4768 1928
TRUE 545 213
Fisher's Exact Test for Count Data
data: table(df[[col]], df[[identified_col]])
p-value = 0.7033
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.8138505 1.1448169
sample estimates:
odds ratio
0.9665266
runFishersEnrichment(agg_geiger_max, "glycoprotein", "seen_three")
FALSE TRUE
FALSE 5227 1469
TRUE 609 149
Fisher's Exact Test for Count Data
data: table(df[[col]], df[[identified_col]])
p-value = 0.1631
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.7161606 1.0532736
sample estimates:
odds ratio
0.8705805
runFishersEnrichment(agg_geiger_max[agg_geiger_max$glycoprotein,], "seen_once", "signal")
FALSE TRUE
FALSE 277 385
TRUE 45 51
Fisher's Exact Test for Count Data
data: table(df[[col]], df[[identified_col]])
p-value = 0.3775
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.5189606 1.2851499
sample estimates:
odds ratio
0.8156389
runFishersEnrichment(agg_geiger_max[agg_geiger_max$glycoprotein,], "seen_twice", "signal")
FALSE TRUE
FALSE 255 290
TRUE 67 146
Fisher's Exact Test for Count Data
data: table(df[[col]], df[[identified_col]])
p-value = 0.00012
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
1.355145 2.722745
sample estimates:
odds ratio
1.914511
runFishersEnrichment(agg_geiger_max[agg_geiger_max$glycoprotein,], "seen_three", "signal")
FALSE TRUE
FALSE 276 333
TRUE 46 103
Fisher's Exact Test for Count Data
data: table(df[[col]], df[[identified_col]])
p-value = 0.001612
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
1.248474 2.785477
sample estimates:
odds ratio
1.854374
qm <- myProtMapper(union(all_proteins, unique(agg_geiger_max$master_protein)))
Querying chunk 1
Querying chunk 2
Querying chunk 3
Querying chunk 4
Querying chunk 5
Querying chunk 6
Querying chunk 7
Querying chunk 8
Finished
Pass returnall=TRUE to return lists of duplicate or missing query terms.
[1] "No KEGG"
the condition has length > 1 and only the first element will be used
# Mapping Uniprot to GO or Uniprot to Interpro for goseq
uniprot.go <- makeGene2Cat(qm,"query","go.all",";")
uniprot.doms <- makeGene2Cat(qm,"query","domains",";")
all_pwf <- makePWF(agg_geiger_max, "seen", "harm_mean_bias", "master_protein")
[1] 2869
initial point very close to some inequality constraints
makePWFPlot(all_pwf, xlab="U2OS abundance")
go_all <- getEnrichedGO(all_pwf, gene2cat=uniprot.go)
Using manually entered categories.
For 92 genes, we could not find any categories. These genes will be excluded.
To force their use, please run with use_genes_without_cat=TRUE (see documentation).
This was the default behavior for version 1.15.1 and earlier.
Calculating the p-values...
'select()' returned 1:1 mapping between keys and columns
print(go_all)
plotGOTerms(go_all, sum(agg_geiger_max$seen),
length(agg_geiger_max$harm_mean_bias), numObs_filter=10,
enrichment_filter=0, switch_axes=T, BH_filter=0.01)
$p
$p$MF
$p$CC
$p$BP
$g
NA
agg_geiger_max$novel <- (agg_geiger_max$Freq==3 & agg_geiger_max$RBP == FALSE)
novel_pwf <- makePWF(agg_geiger_max[agg_geiger_max$Freq==3,], "novel", "harm_mean_bias", "master_protein")
[1] 978
initial point very close to some inequality constraints
makePWFPlot(novel_pwf, xlab="U2OS abundance")
go_novel <- getEnrichedGO(novel_pwf, gene2cat=uniprot.go)
Using manually entered categories.
For 4 genes, we could not find any categories. These genes will be excluded.
To force their use, please run with use_genes_without_cat=TRUE (see documentation).
This was the default behavior for version 1.15.1 and earlier.
Calculating the p-values...
'select()' returned 1:1 mapping between keys and columns
print(table(agg_geiger_max[agg_geiger_max$Freq==3,"novel"]))
FALSE TRUE
640 978
plotGOTerms(go_novel, sum(agg_geiger_max$seen),
length(agg_geiger_max$harm_mean_bias), numObs_filter=10,
enrichment_filter=0, switch_axes=T, BH_filter=0.01)
$p
$p$all
$g
NA
SRP_targetting <- uniprot.go[uniprot.go$to.id=="GO:0006614", "query"]
print(length(intersect(agg_geiger_max[agg_geiger_max$glycoprotein==T, "master_protein"], SRP_targetting)))
[1] 0
print(length(intersect(agg_geiger_max[agg_geiger_max$signal==T, "master_protein"], SRP_targetting)))
[1] 0
twice_pwf <- makePWF(agg_geiger_max, "seen_twice", "harm_mean_bias", "master_protein")
[1] 2141
initial point very close to some inequality constraints
makePWFPlot(twice_pwf, xlab="U2OS abundance")
go_twice <- getEnrichedGO(twice_pwf, gene2cat=uniprot.go)
Using manually entered categories.
For 92 genes, we could not find any categories. These genes will be excluded.
To force their use, please run with use_genes_without_cat=TRUE (see documentation).
This was the default behavior for version 1.15.1 and earlier.
Calculating the p-values...
'select()' returned 1:1 mapping between keys and columns
go_twice <- plotGOTerms(go_twice, sum(agg_geiger_max$seen_twice), length(agg_geiger_max$harm_mean_bias),
numObs_filter=10, enrichment_filter=-Inf, switch_axes=T, BH_filter=0.01)
print(go_twice$p)
$MF
$CC
$BP
plotGOTerms
function(GO_df, len_foreground, len_background,
BH_filter=0.01, enrichment_filter=2, numObs_filter=50,
switch_axes=F, plot_top=10, term.col="term", cut_at){
# GO_df = dataframe from call to goseq
# len_foreground = total number of proteins in foreground
# len_background = total number of proteins in background
GO_df$BH <- p.adjust(GO_df$over_represented_pvalue, method="BH")
GO_df$BH[GO_df$BH==0] <- 1E-16
GO_df$enrichment <- log((GO_df$numDEInCat/GO_df$numInCat) / (len_foreground/len_background),2)
GO_filtered_df <- GO_df[GO_df$BH < BH_filter,]
GO_filtered_df <- GO_filtered_df[GO_filtered_df$enrichment > enrichment_filter,]
GO_filtered_df <- GO_filtered_df[GO_filtered_df$numDEInCat > numObs_filter,]
if(switch_axes){ # determine how long to allow GO term desc. before truncating
GO_filtered_df <- GO_filtered_df[order(GO_filtered_df$enrichment),]
if(missing(cut_at)){
cut_at <- 40
}
}
else{
GO_filtered_df <- GO_filtered_df[order(-GO_filtered_df$enrichment),]
if(missing(cut_at)){
cut_at <- 25
}
}
GO_filtered_df$term <- factor(GO_filtered_df[,term.col], levels=unique(GO_filtered_df[,term.col]))
# ------------------------
# Function\t: shortenTerm
# ------------------------
shortenTerm <- function(term, cut_at){
if(nchar(term)>cut_at){
return(paste0(substr(term, 1, cut_at), " [..]"))
}
else{
return(term)
}
}
plots = NULL
if(length(GO_filtered_df$ontology)<=plot_top){
GO_filtered_df <- GO_filtered_df[order(GO_filtered_df$ontology),]
p <- ggplot(GO_filtered_df, aes(interaction(term, ontology), enrichment, fill=log(BH,10))) +
geom_bar(stat="identity") +
xlab("") + ylab("Enrichment (Log2)") +
scale_fill_continuous(name="BH adj. p-value\\n(Log 10)\\n", low="skyblue", high="grey30") +
my_theme +
theme(
text=element_text(size=15),
plot.title=element_text(hjust=0.5)) +
scale_x_discrete(labels=paste0(sapply(as.character(GO_filtered_df[[term.col]]), FUN=function(x) shortenTerm(x, cut_at)),
" (", GO_filtered_df$ontology, ")"))
if(switch_axes){
p <- p + coord_flip()
}
else{
p <- p + theme(axis.text.x=element_text(size=12, angle=45, vjust=1, hjust=1))
}
plots[["all"]] <- p
}
else{
for(onto_class in unique(GO_filtered_df$ontology)){
tmp_df <- GO_filtered_df[GO_filtered_df$ontology == onto_class,]
if(switch_axes){
tmp_df <- tail(tmp_df,min(plot_top, length(tmp_df[,1])))
}
else{
tmp_df <- head(tmp_df,min(plot_top, length(tmp_df[,1])))
}
tmp_df$short_term <- sapply(as.character(tmp_df$term), FUN=function(x) shortenTerm(x, cut_at))
p <- ggplot(tmp_df, aes(term, enrichment, fill=log(BH,10))) +
geom_bar(stat="identity") +
xlab("") + ylab("Enrichment (Log2)") +
scale_fill_continuous(name="BH adj. p-value\\n(Log 10)\\n", low="skyblue", high="grey30") +
ggtitle(onto_class) +
my_theme +
theme(
text=element_text(size=15),
plot.title=element_text(hjust=0.5)) +
scale_x_discrete(labels=tmp_df$short_term)
if(switch_axes){
p <- p + coord_flip()
}
else{
p <- p + theme(axis.text.x=element_text(size=12, angle=45, vjust=1, hjust=1))
}
plots[[onto_class]] <- p
}
}
return(list(p=plots,g=GO_df))
}
<bytecode: 0x2e1523a8>
three_pwf <- makePWF(agg_geiger_max, "seen_three", "harm_mean_bias", "master_protein")
[1] 1618
initial point very close to some inequality constraints
makePWFPlot(twice_pwf, xlab="U2OS abundance")
go_three <- getEnrichedGO(three_pwf, gene2cat=uniprot.go)
Using manually entered categories.
For 92 genes, we could not find any categories. These genes will be excluded.
To force their use, please run with use_genes_without_cat=TRUE (see documentation).
This was the default behavior for version 1.15.1 and earlier.
Calculating the p-values...
'select()' returned 1:1 mapping between keys and columns
go_three <- plotGOTerms(go_three, sum(agg_geiger_max$seen_three), length(agg_geiger_max$harm_mean_bias),
numObs_filter=10, enrichment_filter=-Inf, switch_axes=T, BH_filter=0.01)
print(go_twice$p)
$MF
$CC
$BP
once_pwf <- makePWF(agg_geiger_max, "seen_once", "harm_mean_bias", "master_protein")
[1] 728
initial point very close to some inequality constraints
makePWFPlot(once_pwf, xlab="U2OS abundance")
go_once <- getEnrichedGO(once_pwf, gene2cat=uniprot.go)
Using manually entered categories.
For 92 genes, we could not find any categories. These genes will be excluded.
To force their use, please run with use_genes_without_cat=TRUE (see documentation).
This was the default behavior for version 1.15.1 and earlier.
Calculating the p-values...
'select()' returned 1:1 mapping between keys and columns
go_once <- plotGOTerms(go_once, sum(agg_geiger_max$seen_once), length(agg_geiger_max$harm_mean_bias),
numObs_filter=10, enrichment_filter=-Inf, switch_axes=T, BH_filter=0.01)
print(go_once$p)
$all
domain_twice.abundance <- goseq(twice_pwf, gene2cat=uniprot.doms)
Using manually entered categories.
For 114 genes, we could not find any categories. These genes will be excluded.
To force their use, please run with use_genes_without_cat=TRUE (see documentation).
This was the default behavior for version 1.15.1 and earlier.
Calculating the p-values...
domain_twice.abundance$BH <- p.adjust(domain_twice.abundance$over_represented_pvalue, method="BH")
print(head(domain_twice.abundance))
domain_once.abundance <- goseq(once_pwf, gene2cat=uniprot.doms)
Using manually entered categories.
For 114 genes, we could not find any categories. These genes will be excluded.
To force their use, please run with use_genes_without_cat=TRUE (see documentation).
This was the default behavior for version 1.15.1 and earlier.
Calculating the p-values...
domain_once.abundance$BH <- p.adjust(domain_once.abundance$over_represented_pvalue, method="BH")
print(head(domain_once.abundance))
go_merged <- merge(go_three$g[,c("category", "term", "over_represented_pvalue", "enrichment")],
go_once$g[,c("category", "term", "over_represented_pvalue", "enrichment")],
by="category")
sig_MF_terms <- go_three$g[(go_three$g$BH<0.01 & go_three$g$ontology=="MF"), "term"]
go_merged$sig_MF_term <- go_merged$term.x %in% sig_MF_terms
go_merged$label <- go_merged$term.x
go_merged$label[go_merged$sig_MF_term==F] <- NA
p <- ggplot(go_merged[go_merged$category %in% go_twice$g[go_three$g$BH<0.01,"category"],],
aes(log(over_represented_pvalue.x,10), log(over_represented_pvalue.y,10))) +
geom_point() + geom_smooth(method="lm") + my_theme
print(p)
p <- ggplot(go_merged[(go_merged$category %in% go_three$g[go_three$g$BH<0.01,"category"] &
go_merged$enrichment.x > 0),],
aes(enrichment.x, enrichment.y)) +
geom_point(aes(colour=sig_MF_term)) + geom_smooth(method="lm") +
geom_text(aes(label=label), nudge_x = -0.2, nudge_y = 0.3, colour="red",
check_overlap = TRUE, hjust = 1, size=3.4) +
scale_colour_manual(values=c("black", "red"), guide=F) +
my_theme
print(p)
twice_glyco_pwf <- makePWF(agg_geiger_max, "seen_twice_glycoproteins", "harm_mean_bias", "master_protein")
[1] 213
initial point very close to some inequality constraints
makePWFPlot(twice_glyco_pwf, xlab="U2OS abundance")
go_twice_glyco <- getEnrichedGO(twice_glyco_pwf, gene2cat=uniprot.go)
Using manually entered categories.
For 92 genes, we could not find any categories. These genes will be excluded.
To force their use, please run with use_genes_without_cat=TRUE (see documentation).
This was the default behavior for version 1.15.1 and earlier.
Calculating the p-values...
'select()' returned 1:1 mapping between keys and columns
go_twice_glyco <- plotGOTerms(go_twice_glyco, sum(agg_geiger_max$seen_twice_glycoproteins),
length(agg_geiger_max$harm_mean_bias),
numObs_filter=10, enrichment_filter=1, switch_axes=T, BH_filter=0.01)
print(go_twice_glyco$p)
$CC
$MF
$BP
protein_desc <- read.csv("../../../shared_files/human_protein_ids.tsv", sep="\t", header=F)
colnames(protein_desc) <- c("UniprotID", "Symbol", "Description")
agg_geiger_max <- merge(agg_geiger_max, protein_desc, by.x="master_protein", by.y="UniprotID", all.x=T)
SRP_targetting <- uniprot.go[uniprot.go$to.id=="GO:0006614", "query"]
tmp_df <- agg_geiger_max[agg_geiger_max$master_protein %in% SRP_targetting,]
tmp_df <- tmp_df[order(-tmp_df$Freq),]
print(tmp_df)
print(tmp_df[,c("Freq", "Description", "svm", "svm.scores", "Length")])
print(tmp_df[tmp_df$Freq>0,"Description"])
[1] Signal recognition particle subunit SRP72 (SRP72) (Signal recognition particle 72 kDa protein)
[2] 60S acidic ribosomal protein P2 (Large ribosomal subunit protein P2) (Renal carcinoma antigen NY-REN-44)
[3] 60S acidic ribosomal protein P0 (60S ribosomal protein L10E) (Large ribosomal subunit protein uL10)
[4] Signal recognition particle receptor subunit alpha (SR-alpha) (Docking protein alpha) (DP-alpha)
[5] 40S ribosomal protein S2 (40S ribosomal protein S4) (Protein LLRep3) (Small ribosomal subunit protein uS5)
[6] 60S ribosomal protein L7 (Large ribosomal subunit protein uL30)
[7] 40S ribosomal protein S3 (EC 4.2.99.18) (Small ribosomal subunit protein uS3)
[8] 40S ribosomal protein S12 (Small ribosomal subunit protein eS12)
[9] 60S ribosomal protein L13 (Breast basic conserved protein 1) (Large ribosomal subunit protein eL13)
[10] 60S ribosomal protein L10 (Laminin receptor homolog) (Large ribosomal subunit protein uL16) (Protein QM) (Ribosomal protein L10) (Tumor suppressor QM)
[11] 60S ribosomal protein L12 (Large ribosomal subunit protein uL11)
[12] 60S ribosomal protein L22 (EBER-associated protein) (EAP) (Epstein-Barr virus small RNA-associated protein) (Heparin-binding protein HBp15) (Large ribosomal subunit protein eL22)
[13] 60S ribosomal protein L4 (60S ribosomal protein L1) (Large ribosomal subunit protein uL4)
[14] 40S ribosomal protein S19 (Small ribosomal subunit protein eS19)
[15] 60S ribosomal protein L3 (HIV-1 TAR RNA-binding protein B) (TARBP-B) (Large ribosomal subunit protein uL3)
[16] 60S ribosomal protein L35 (Large ribosomal subunit protein uL29)
[17] 60S ribosomal protein L27a (Large ribosomal subunit protein uL15)
[18] 60S ribosomal protein L5 (Large ribosomal subunit protein uL18)
[19] 60S ribosomal protein L21 (Large ribosomal subunit protein eL21)
[20] 60S ribosomal protein L28 (Large ribosomal subunit protein eL28)
[21] 40S ribosomal protein S9 (Small ribosomal subunit protein uS4)
[22] 40S ribosomal protein S10 (Small ribosomal subunit protein eS10)
[23] 60S ribosomal protein L29 (Cell surface heparin-binding protein HIP) (Large ribosomal subunit protein eL29)
[24] 60S ribosomal protein L34 (Large ribosomal subunit protein eL34)
[25] 60S ribosomal protein L14 (CAG-ISL 7) (Large ribosomal subunit protein eL14)
[26] 40S ribosomal protein S20 (Small ribosomal subunit protein uS10)
[27] Signal recognition particle 54 kDa protein (SRP54)
[28] 40S ribosomal protein S3a (Small ribosomal subunit protein eS1) (v-fos transformation effector protein) (Fte-1)
[29] 60S ribosomal protein L26 (Large ribosomal subunit protein uL24)
[30] 60S ribosomal protein L15 (Large ribosomal subunit protein eL15)
[31] 60S ribosomal protein L37 (G1.16) (Large ribosomal subunit protein eL37)
[32] 40S ribosomal protein S7 (Small ribosomal subunit protein eS7)
[33] 40S ribosomal protein S8 (Small ribosomal subunit protein eS8)
[34] 40S ribosomal protein S15a (Small ribosomal subunit protein uS8)
[35] 40S ribosomal protein S16 (Small ribosomal subunit protein uS9)
[36] 40S ribosomal protein S14 (Small ribosomal subunit protein uS11)
[37] 40S ribosomal protein S23 (Small ribosomal subunit protein uS12)
[38] 40S ribosomal protein S18 (Ke-3) (Ke3) (Small ribosomal subunit protein uS13)
[39] 40S ribosomal protein S13 (Small ribosomal subunit protein uS15)
[40] 40S ribosomal protein S11 (Small ribosomal subunit protein uS17)
[41] 60S ribosomal protein L7a (Large ribosomal subunit protein eL8) (PLA-X polypeptide) (Surfeit locus protein 3)
[42] 40S ribosomal protein S4, X isoform (SCR10) (Single copy abundant mRNA protein) (Small ribosomal subunit protein eS4)
[43] 40S ribosomal protein S6 (Phosphoprotein NP33) (Small ribosomal subunit protein eS6)
[44] 60S ribosomal protein L23 (60S ribosomal protein L17) (Large ribosomal subunit protein uL14)
[45] 40S ribosomal protein S24 (Small ribosomal subunit protein eS24)
[46] 60S ribosomal protein L30 (Large ribosomal subunit protein eL30)
[47] 60S ribosomal protein L31 (Large ribosomal subunit protein eL31)
[48] 60S ribosomal protein L10a (CSA-19) (Large ribosomal subunit protein uL1) (Neural precursor cell expressed developmentally down-regulated protein 6) (NEDD-6)
[49] 60S ribosomal protein L11 (CLL-associated antigen KW-12) (Large ribosomal subunit protein uL5)
[50] 60S ribosomal protein L8 (Large ribosomal subunit protein uL2)
[51] 60S ribosomal protein L24 (60S ribosomal protein L30) (Large ribosomal subunit protein eL24)
[52] 60S ribosomal protein L19 (Large ribosomal subunit protein eL19)
[53] 60S ribosomal protein L18a (Large ribosomal subunit protein eL20)
[54] 60S ribosomal protein L6 (Large ribosomal subunit protein eL6) (Neoplasm-related protein C140) (Tax-responsive enhancer element-binding protein 107) (TaxREB107)
[55] Translocation protein SEC63 homolog
[56] Signal recognition particle subunit SRP68 (SRP68) (Signal recognition particle 68 kDa protein)
[57] 60S ribosomal protein L27 (Large ribosomal subunit protein eL27)
[58] 60S ribosomal protein L23a (Large ribosomal subunit protein uL23)
[59] 40S ribosomal protein S26 (Small ribosomal subunit protein eS26)
[60] 60S ribosomal protein L36 (Large ribosomal subunit protein eL36)
[61] 40S ribosomal protein SA (37 kDa laminin receptor precursor) (37LRP) (37/67 kDa laminin receptor) (LRP/LR) (67 kDa laminin receptor) (67LR) (Colon carcinoma laminin-binding protein) (Laminin receptor 1) (LamR) (Laminin-binding protein precursor p40) (LBP/p40) (Multidrug resistance-associated protein MGr1-Ag) (NEM/1CHD4) (Small ribosomal subunit protein uS2)
[62] 40S ribosomal protein S5 (Small ribosomal subunit protein uS7) [Cleaved into: 40S ribosomal protein S5, N-terminally processed]
[63] 40S ribosomal protein S25 (Small ribosomal subunit protein eS25)
[64] 40S ribosomal protein S28 (Small ribosomal subunit protein eS28)
[65] Ubiquitin-40S ribosomal protein S27a (Ubiquitin carboxyl extension protein 80) [Cleaved into: Ubiquitin; 40S ribosomal protein S27a (Small ribosomal subunit protein eS31)]
20244 Levels: (E3-independent) E2 ubiquitin-conjugating enzyme (EC 2.3.2.24) (E2/E3 hybrid ubiquitin-protein ligase UBE2O) (Ubiquitin carrier protein O) (Ubiquitin-conjugating enzyme E2 O) (Ubiquitin-conjugating enzyme E2 of 230 kDa) (Ubiquitin-conjugating enzyme E2-230K) (Ubiquitin-protein ligase O) ...
print(tmp_df[tmp_df$Freq==0,"Description"])
[1] 60S acidic ribosomal protein P1 (Large ribosomal subunit protein P1)
[2] 40S ribosomal protein S17 (Small ribosomal subunit protein eS17)
[3] Signal recognition particle 19 kDa protein (SRP19)
[4] 60S ribosomal protein L35a (Cell growth-inhibiting gene 33 protein) (Large ribosomal subunit protein eL33)
[5] 60S ribosomal protein L17 (60S ribosomal protein L23) (Large ribosomal subunit protein uL22) (PD-1)
[6] 60S ribosomal protein L9 (Large ribosomal subunit protein uL6)
[7] Signal recognition particle 14 kDa protein (SRP14) (18 kDa Alu RNA-binding protein)
[8] 60S ribosomal protein L13a (23 kDa highly basic protein) (Large ribosomal subunit protein uL13)
[9] Signal recognition particle 9 kDa protein (SRP9)
[10] 60S ribosomal protein L37a (Large ribosomal subunit protein eL43)
[11] Protein transport protein Sec61 subunit alpha isoform 1 (Sec61 alpha-1)
[12] 40S ribosomal protein S29 (Small ribosomal subunit protein uS14)
[13] 40S ribosomal protein S15 (RIG protein) (Small ribosomal subunit protein uS19)
[14] 60S ribosomal protein L32 (Large ribosomal subunit protein eL32)
[15] Ubiquitin-60S ribosomal protein L40 (CEP52) (Ubiquitin A-52 residue ribosomal protein fusion product 1) [Cleaved into: Ubiquitin; 60S ribosomal protein L40 (Large ribosomal subunit protein eL40)]
[16] 60S ribosomal protein L38 (Large ribosomal subunit protein eL38)
[17] 40S ribosomal protein S21 (Small ribosomal subunit protein eS21)
[18] 60S ribosomal protein L36a (60S ribosomal protein L44) (Cell growth-inhibiting gene 15 protein) (Cell migration-inducing gene 6 protein) (Large ribosomal subunit protein eL42)
[19] 60S ribosomal protein L18 (Large ribosomal subunit protein eL18)
[20] Translocon-associated protein subunit gamma (TRAP-gamma) (Signal sequence receptor subunit gamma) (SSR-gamma)
20244 Levels: (E3-independent) E2 ubiquitin-conjugating enzyme (EC 2.3.2.24) (E2/E3 hybrid ubiquitin-protein ligase UBE2O) (Ubiquitin carrier protein O) (Ubiquitin-conjugating enzyme E2 O) (Ubiquitin-conjugating enzyme E2 of 230 kDa) (Ubiquitin-conjugating enzyme E2-230K) (Ubiquitin-protein ligase O) ...
print(tmp_df[grep("Signal", tmp_df$Description),c("master_protein", "Freq", "Description", "svm", "svm.scores", "Length")])
tmp_df <- agg_geiger_max[(#agg_geiger_max$glycoprotein==T &
#agg_geiger_max$seen_all==T &
agg_geiger_max$signal==T &
agg_geiger_max$svm=="PM" &
agg_geiger_max$svm.scores>0.9),]
tmp_df <- tmp_df[!is.na(tmp_df$master_protein),]
tmp_df[order(tmp_df$Length),]